In [1]:
# Import Dependencies
%matplotlib inline

# Start Python Imports
import math, time, random, datetime

# Data Manipulation
import numpy as np
import pandas as pd

# Visualization 
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.io as pio
import plotly.figure_factory as ff
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Let's be rebels and ignore warnings for now
import warnings
warnings.filterwarnings('ignore')
In [2]:
from IPython.display import display
from IPython.display import HTML
from IPython.display import Image
import IPython.core.display as di # Example: di.display_html('<h3>%s:</h3>' % str, raw=True)

display(HTML("<style>.container { width:1600px !important; }</style>"))

# This line will hide code by default when the notebook is exported as HTML
di.display_html('<script>jQuery(function() {if (jQuery("body.notebook_app").length == 0) { jQuery(".input_area").toggle(); jQuery(".prompt").toggle();}});</script>', raw=True)

# This line will add a button to toggle visibility of code blocks, for use with the HTML export version
di.display_html('''<button onclick="jQuery('.input_area').toggle(); jQuery('.prompt').toggle();">Toggle code</button>''', raw=True)

Delays by Airline¶

The following dataset was dowloaded from Kaggle

In [3]:
df_22 = pd.read_csv('Combined_Flights_2022.csv') ## Flights 2022
df_airlines = pd.read_csv('Airlines.csv') ## Flights from 2017-2022
In [4]:
df_22['Operating_Airline'].value_counts()
Out[4]:
WN    731925
DL    512114
AA    495953
OO    440807
UA    352009
YX    197881
B6    156793
MQ    150758
9E    142624
OH    135884
AS    129510
NK    129208
F9     86557
G4     73504
YV     71972
QX     56320
PT     52827
C5     44367
HA     42212
ZW     40300
G7     34793
Name: Operating_Airline, dtype: int64

How many total flights were in the USA by Airline during 2022?¶

In [5]:
airline_counts = df_22['Operating_Airline'].value_counts()
airline_counts.index = airline_counts.index.map(df_airlines.set_index('Code')['Description']).rename('Airline')
airline_counts = airline_counts.sort_values(ascending=True).to_frame("Count").reset_index()
df_22['Cancelled'] = df_22['Cancelled'].astype('bool')
In [6]:
airline_counts
Out[6]:
Airline Count
0 GoJet Airlines, LLC d/b/a United Express 34793
1 Air Wisconsin Airlines Corp 40300
2 Hawaiian Airlines Inc. 42212
3 Commutair Aka Champlain Enterprises, Inc. 44367
4 Capital Cargo International 52827
5 Horizon Air 56320
6 Mesa Airlines Inc. 71972
7 Allegiant Air 73504
8 Frontier Airlines Inc. 86557
9 Spirit Air Lines 129208
10 Alaska Airlines Inc. 129510
11 Comair Inc. 135884
12 Endeavor Air Inc. 142624
13 Envoy Air 150758
14 JetBlue Airways 156793
15 Republic Airlines 197881
16 United Air Lines Inc. 352009
17 SkyWest Airlines Inc. 440807
18 American Airlines Inc. 495953
19 Delta Air Lines Inc. 512114
20 Southwest Airlines Co. 731925
In [7]:
px.bar(
    airline_counts,
    y='Airline',
    x='Count',
    orientation='h', 
    template='plotly_dark', 
    title='Total Flights During 2022 Per Airline',
)
In [8]:
## Filters all the cancelled flights
df_22.query('Cancelled').head()
Out[8]:
FlightDate Airline Origin Dest Cancelled Diverted CRSDepTime DepTime DepDelayMinutes DepDelay ... WheelsOff WheelsOn TaxiIn CRSArrTime ArrDelay ArrDel15 ArrivalDelayGroups ArrTimeBlk DistanceGroup DivAirportLandings
949 2022-04-02 Commutair Aka Champlain Enterprises, Inc. JAX IAH True False 1842 NaN NaN NaN ... NaN NaN NaN 2031 NaN NaN NaN 2000-2059 4 0
970 2022-04-02 Commutair Aka Champlain Enterprises, Inc. IAH JAX True False 1435 NaN NaN NaN ... NaN NaN NaN 1757 NaN NaN NaN 1700-1759 4 0
982 2022-04-02 Commutair Aka Champlain Enterprises, Inc. IAH ECP True False 1426 NaN NaN NaN ... NaN NaN NaN 1611 NaN NaN NaN 1600-1659 3 0
1001 2022-04-02 Commutair Aka Champlain Enterprises, Inc. ECP IAH True False 1650 NaN NaN NaN ... NaN NaN NaN 1855 NaN NaN NaN 1800-1859 3 0
1451 2022-04-01 GoJet Airlines, LLC d/b/a United Express ITH EWR True False 700 NaN NaN NaN ... NaN NaN NaN 824 NaN NaN NaN 0800-0859 1 0

5 rows × 61 columns

In [9]:
pct_cancelled = df_22['Cancelled'].mean()*100
print(f'Of all flights in this DataSet {pct_cancelled:0.2f}% are Cancelled')
Of all flights in this DataSet 3.02% are Cancelled
In [10]:
df_22['Cancelled'].value_counts()
Out[10]:
False    3955126
True      123192
Name: Cancelled, dtype: int64

Which Airlines were most cancelled during that period? (count and percentage)¶

In [11]:
cancelled_counts = df_22.query('Year == 2022 and Cancelled').groupby(['Operating_Airline'])['Cancelled'].agg(['count','sum'])
cancelled_counts.head()
Out[11]:
count sum
Operating_Airline
9E 5819 5819
AA 18736 18736
AS 3738 3738
B6 7631 7631
C5 1518 1518
In [12]:
cancel = df_22.query('Year == 2022').groupby(['Operating_Airline','Cancelled']).size().unstack().reset_index()
cancel.head()
Out[12]:
Cancelled Operating_Airline False True
0 9E 136805 5819
1 AA 477217 18736
2 AS 125772 3738
3 B6 149162 7631
4 C5 42849 1518
In [13]:
cancel.index = cancel.index.rename(None)
cancel.rename(columns={False: 'Not Cancelled', True: 'Cancelled'}).rename_axis(None, axis=1).head()
Out[13]:
Operating_Airline Not Cancelled Cancelled
0 9E 136805 5819
1 AA 477217 18736
2 AS 125772 3738
3 B6 149162 7631
4 C5 42849 1518
In [14]:
cancel2 = (
    (
        df_22.query('Year == 2022')
        .groupby(['Operating_Airline','Cancelled'])
        .size()
        .unstack()
        .reset_index()
    )
    .rename(columns={False: 'Not Cancelled', True: 'Cancelled'})
    .rename_axis(None, axis=1)
)
In [15]:
cancel2.head()
Out[15]:
Operating_Airline Not Cancelled Cancelled
0 9E 136805 5819
1 AA 477217 18736
2 AS 125772 3738
3 B6 149162 7631
4 C5 42849 1518
In [16]:
cancel3 = cancel2.assign(Total=cancel2["Not Cancelled"] + cancel2['Cancelled'])
cancel3.head()
Out[16]:
Operating_Airline Not Cancelled Cancelled Total
0 9E 136805 5819 142624
1 AA 477217 18736 495953
2 AS 125772 3738 129510
3 B6 149162 7631 156793
4 C5 42849 1518 44367
In [17]:
cancel4 = cancel3.assign(pct_cancelled=(cancel3["Cancelled"] / cancel3['Total'])*100)
cancel4.head()
Out[17]:
Operating_Airline Not Cancelled Cancelled Total pct_cancelled
0 9E 136805 5819 142624 4.079958
1 AA 477217 18736 495953 3.777777
2 AS 125772 3738 129510 2.886264
3 B6 149162 7631 156793 4.866926
4 C5 42849 1518 44367 3.421462
In [18]:
airline_map = df_airlines.set_index('Code')['Description']
airline_map
Out[18]:
Code
02Q                       Titan Airways
04Q                  Tradewind Aviation
05Q                 Comlux Aviation, AG
06Q       Master Top Linhas Aereas Ltd.
07Q                 Flair Airlines Ltd.
                      ...              
ZW          Air Wisconsin Airlines Corp
ZX                         Air Georgian
ZX (1)                       Airbc Ltd.
ZY               Atlantic Gulf Airlines
ZYZ                Skyway Aviation Inc.
Name: Description, Length: 1571, dtype: object
In [19]:
cancel['Airline'] = cancel["Operating_Airline"].map(airline_map)
cancel['Airline']
Out[19]:
0                             Endeavor Air Inc.
1                        American Airlines Inc.
2                          Alaska Airlines Inc.
3                               JetBlue Airways
4     Commutair Aka Champlain Enterprises, Inc.
5                          Delta Air Lines Inc.
6                        Frontier Airlines Inc.
7                                 Allegiant Air
8      GoJet Airlines, LLC d/b/a United Express
9                        Hawaiian Airlines Inc.
10                                    Envoy Air
11                             Spirit Air Lines
12                                  Comair Inc.
13                        SkyWest Airlines Inc.
14                  Capital Cargo International
15                                  Horizon Air
16                        United Air Lines Inc.
17                       Southwest Airlines Co.
18                           Mesa Airlines Inc.
19                            Republic Airlines
20                  Air Wisconsin Airlines Corp
Name: Airline, dtype: object
In [20]:
cancel5 = cancel4.assign(pct_cancelled=(cancel4["Cancelled"] / cancel4['Total'])*100).assign(Airline=cancel4["Operating_Airline"].map(airline_map))
cancel5.head().head()
Out[20]:
Operating_Airline Not Cancelled Cancelled Total pct_cancelled Airline
0 9E 136805 5819 142624 4.079958 Endeavor Air Inc.
1 AA 477217 18736 495953 3.777777 American Airlines Inc.
2 AS 125772 3738 129510 2.886264 Alaska Airlines Inc.
3 B6 149162 7631 156793 4.866926 JetBlue Airways
4 C5 42849 1518 44367 3.421462 Commutair Aka Champlain Enterprises, Inc.
In [21]:
cancel5.set_index('Airline').sort_values('Cancelled')['Cancelled'].plot(kind='barh', figsize=(10,5), title="Cancelation by Airline 2022")
Out[21]:
<AxesSubplot:title={'center':'Cancelation by Airline 2022'}, ylabel='Airline'>
In [22]:
fig, ax = plt.subplots(figsize=(10,10))
bars = (
cancel5.set_index('Airline').sort_values('pct_cancelled')['pct_cancelled'].plot(
    kind='barh', 
    figsize=(10,5), 
    title="% Cancelation by Airline 2022",
    width=0.7,
    edgecolor='black'
))

ax.bar_label(ax.containers[0], fmt='%0.2f%%', padding=-40, color='white')
plt.show()

How many Flights were delayed in the same period?¶

In [23]:
df_22['DelayGroup'] = None
df_22.loc[df_22['DepDelayMinutes'] == 0, 'DelayGroup'] = 'OnTime_Early'
df_22.loc[(df_22['DepDelayMinutes'] > 0) & (df_22['DepDelayMinutes'] <= 15), 'DelayGroup'] = 'Small_Delay'
df_22.loc[(df_22['DepDelayMinutes'] > 15) & (df_22['DepDelayMinutes'] <= 45), 'DelayGroup'] = 'Medium_Delay'
df_22.loc[df_22['DepDelayMinutes'] > 45, 'DelayGroup'] = 'Large_Delay'
df_22.loc[df_22['Cancelled'], 'DelayGroup'] = 'Cancelled'

# Generally 0-15 min delays are considered not significant. 
# Usually it can be made up for with an earlier arrival,
In [24]:
df_22['DelayGroup'].value_counts()
Out[24]:
OnTime_Early    2398098
Small_Delay      722122
Medium_Delay     445619
Large_Delay      389287
Cancelled        123192
Name: DelayGroup, dtype: int64
In [25]:
pal = sns.color_palette()
In [26]:
df_22['DelayGroup'].value_counts(ascending=True).plot(kind='barh', color=pal[5], title='Delay Category', figsize=(10,5))
Out[26]:
<AxesSubplot:title={'center':'Delay Category'}>
In [27]:
print((df_22['DelayGroup'].value_counts() / df_22.shape[0] * 100).round(2))
print(sum(df_22['DelayGroup'].value_counts() / df_22.shape[0] * 100))
OnTime_Early    58.80
Small_Delay     17.71
Medium_Delay    10.93
Large_Delay      9.55
Cancelled        3.02
Name: DelayGroup, dtype: float64
99.99999999999999
In [28]:
plt.figure(figsize=(8,6))
sns.countplot(data=df_22, x='DelayGroup')
plt.suptitle('Delay Groups')
Out[28]:
Text(0.5, 0.98, 'Delay Groups')
In [29]:
!pip install Pyppeteer
!pyppeteer-install
!pip install Flask Jinja2
Collecting Pyppeteer
  Using cached pyppeteer-1.0.2-py3-none-any.whl (83 kB)
Collecting pyee<9.0.0,>=8.1.0
  Using cached pyee-8.2.2-py2.py3-none-any.whl (12 kB)
Requirement already satisfied: certifi>=2021 in ./env/lib/python3.10/site-packages (from Pyppeteer) (2022.9.24)
Collecting importlib-metadata>=1.4
  Downloading importlib_metadata-5.1.0-py3-none-any.whl (21 kB)
Collecting appdirs<2.0.0,>=1.4.3
  Using cached appdirs-1.4.4-py2.py3-none-any.whl (9.6 kB)
Collecting tqdm<5.0.0,>=4.42.1
  Using cached tqdm-4.64.1-py2.py3-none-any.whl (78 kB)
Collecting websockets<11.0,>=10.0
  Using cached websockets-10.4-cp310-cp310-macosx_10_9_x86_64.whl (97 kB)
Requirement already satisfied: urllib3<2.0.0,>=1.25.8 in ./env/lib/python3.10/site-packages (from Pyppeteer) (1.26.11)
Collecting zipp>=0.5
  Using cached zipp-3.10.0-py3-none-any.whl (6.2 kB)
Installing collected packages: pyee, appdirs, zipp, websockets, tqdm, importlib-metadata, Pyppeteer
Successfully installed Pyppeteer-1.0.2 appdirs-1.4.4 importlib-metadata-5.1.0 pyee-8.2.2 tqdm-4.64.1 websockets-10.4 zipp-3.10.0
chromium is already installed.
Collecting Flask
  Using cached Flask-2.2.2-py3-none-any.whl (101 kB)
Requirement already satisfied: Jinja2 in ./env/lib/python3.10/site-packages (3.0.3)
Collecting click>=8.0
  Using cached click-8.1.3-py3-none-any.whl (96 kB)
Collecting Werkzeug>=2.2.2
  Using cached Werkzeug-2.2.2-py3-none-any.whl (232 kB)
Collecting itsdangerous>=2.0
  Using cached itsdangerous-2.1.2-py3-none-any.whl (15 kB)
Requirement already satisfied: MarkupSafe>=2.0 in ./env/lib/python3.10/site-packages (from Jinja2) (2.1.1)
Installing collected packages: Werkzeug, itsdangerous, click, Flask
Successfully installed Flask-2.2.2 Werkzeug-2.2.2 click-8.1.3 itsdangerous-2.1.2
In [ ]: